library(phyloseq)
library(ggplot2)
library(dplyr)
library(scales)
library(grid)
library(reshape2)
library(gridExtra)
library(vegan)
library(cowplot)
library(gtable)
library(pander)
library(tidyr)
library(psych)
## objects and functions that will be useful throughout this analysis
# Set the ggplot theme
theme_set(theme_bw())
# Color palette for stations
station_colors = c("red", "#ffa500", "#0080ff")
# Function to order date levels correctly, including Aug 11
order_dates_aug11 <- function(df) {
df$Date <- factor(df$Date,
levels = c("6/16","6/30","7/8","7/14","7/21",
"7/29","8/4","8/11","8/18","8/25","9/2","9/8","9/15",
"9/23","9/29","10/6","10/15","10/20","10/27"))
return(df)
}
# Function to order date levels correctly, not including Aug 11
order_dates <- function(df) {
df$Date <- factor(df$Date,
levels = c("6/16","6/30","7/8","7/14","7/21",
"7/29","8/4", "8/18","8/25","9/2","9/8","9/15",
"9/23","9/29","10/6","10/15","10/20","10/27"))
return(df)
}
# Function to create a named list
# Args: a vector of strings
#
# Returns: a list with the names supplied in the vector
named_list <- function(...){
names <- as.list(substitute(list(...)))[-1L]
result <- list(...)
names(result) <- names
result
}
# Source some useful functions for data normalization
source("~/git_repos/MicrobeMiseq/R/miseqR.R")
load("../../Data/formatted-data/erie-data.RData")
# Inspect erie phyloseq object
erie
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 7192 taxa and 53 samples ]
## sample_data() Sample Data: [ 53 samples by 32 sample variables ]
## tax_table() Taxonomy Table: [ 7192 taxa by 9 taxonomic ranks ]
Scale counts to an even depth by dividing by total reads and multiplying by the minimum library size.
## Scale reads in OTU table to even depth
depth = 15000
erie_scale <-
erie %>%
scale_reads(n = depth, round = "round")
Create figure 1, showing pigment concentrations, toxin concentration, and proportion of cyanobacterial reads over time and stations.
# Import metadata file with nutrients, pigments and toxin
nutrient <- read.csv("../../Data/formatted-data/nutrient_cleaned.csv")
# Format nutrient data
nutrient_sub <-
nutrient %>%
filter(!(Date %in% c("5/27", "6/10", "11/3"))) %>%
order_dates_aug11()
# Calculate relative abundance of Cyanobacteria at each date
cyano_abundance <-
erie_scale %>%
tax_glom(taxrank = "Phylum") %>% # conglomerate OTUs to phylum level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # transform to relative abundance
subset_taxa(Phylum == "Cyanobacteria") %>% # Subset to just Cyanobacteria
psmelt() %>% # melt phyloseq object
rename(Cyanobacteria = Abundance) %>%
select(Cyanobacteria, Date, Station)
# Merge cyanobacteria data with nutrient df
bloom_df <-
nutrient_sub %>%
left_join(cyano_abundance, by = c("Station", "Date")) %>%
mutate(Phycocyanin = ifelse(Phycocyanin > 80, 80, Phycocyanin)) %>% # lower extreme values to plot better
select(Station, Date, Phycocyanin, Chla, ParMC, Cyanobacteria) %>%
melt(id.vars = c("Station", "Date")) %>%
order_dates_aug11()
# Make a faceted ggplot of the four bloom variables over time and grouped by station
bloom_plots <- ggplot(bloom_df,
aes(x = Date, y = value, group = Station, color = Station, shape = Station)
) +
facet_grid(variable~., scales = "free_y") +
geom_point(size = 1.3) +
geom_line(size = 1) +
ylab("") +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
) +
scale_color_manual(values = station_colors) +
theme(
strip.background = element_blank(),
strip.text = element_text(size = 11),
axis.title.x = element_blank()
)
# function to extract a legend from a ggplot object
grab_legend <- function(a_ggplot) {
tmp <- ggplot_gtable(ggplot_build(a_ggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
legend
}
station_legend <- grab_legend(bloom_plots)
bloom_plots
ggsave("../../Plots/Figure1.pdf", plot = bloom_plots, width = 6, height = 5)
par(mfrow = c(1,3))
hist(nutrient$Chla)
hist(nutrient$Phycocyanin)
hist(nutrient$ParMC)
These distributions are very non-normal. Let’s try log-scaling
par(mfrow = c(1,3))
hist(log(nutrient$Chla), xlab = "log Chla", main = "")
hist(log(nutrient$Phycocyanin), xlab = "log Phycocyanin", main = "")
hist(log(nutrient$ParMC), xlab = "log ParMC", main = "")
These look better, so we will use these transformed variables for our correlation tests. Since phycocyanin and parmc both have zeroes, we will add a constant to all values. We selected this constant to be small and to minimally impact the distribution shape
par(mfrow = c(1,2))
logChla <- log(nutrient$Chla)
logPhyco <- log(nutrient$Phycocyanin + 0.1)
logParMC <- log(nutrient$ParMC + 0.1)
hist(logPhyco, main = "", xlab = "log Phyco + 0.1")
hist(logParMC, main = "", xlab = "log ParMC + 0.1")
# lets add log-scaled versions of variables to phyloseq object
nutrient$logChla <- logChla
nutrient$logPhyco <- logPhyco
nutrient$logParMC <- logParMC
molec_samples <- as.vector(sample_data(erie)$SampleID)
nutrient_sub <- nutrient %>%
select(logChla, logPhyco, logParMC, SampleID) %>%
filter(SampleID %in% molec_samples)
sample_data(erie)$logChla <- nutrient_sub$logChla
sample_data(erie)$logPhyco <- nutrient_sub$logPhyco
sample_data(erie)$logParMC <- nutrient_sub$logParMC
par(mfrow = c(1,2))
plot(logChla, logPhyco, xlab = "log Chla", ylab = "log Phycocyanin")
# Examine residuals from linear regrerssion
hist(lm(logChla~logPhyco)$residuals, xlab = "residuals", main = "")
# Calculate linear correlation between Chla and phycocyanin for all sites
cor.test(
x = logChla,
y = logPhyco,
alternative = "two.sided",
method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: logChla and logPhyco
## t = 10.175, df = 55, p-value = 2.985e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6936187 0.8828031
## sample estimates:
## cor
## 0.8081294
It looks like there is a pretty close correlation between chl a and phycocyanin measurements. Assumptions about normality of model residuals are met when log-scaling both variables.
par(mfrow = c(1,2))
plot(logChla, logParMC)
plot(logPhyco, logParMC)
cor.test(
x = logChla,
y = logParMC,
alternative = "two.sided",
method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: logChla and logParMC
## t = 6.2612, df = 55, p-value = 6.071e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4622297 0.7753393
## sample estimates:
## cor
## 0.6451002
cor.test(
x = logPhyco,
y = logParMC,
alternative = "two.sided",
method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: logPhyco and logParMC
## t = 11.949, df = 55, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7565981 0.9089839
## sample estimates:
## cor
## 0.8496596
The statistical tests show that there are significant correlations between pigments and toxin, but the plots show that there are several dates in which the toxin levels are 0 but pigments are high, indicating that pigments are not always predictive of toxicity.
Do the nearshore and offshore sites have different bloom pigment concentrations?
nutrient %>%
group_by(Shore) %>%
summarise(median(Chla))
## # A tibble: 2 x 2
## Shore median(Chla)
## <fctr> <dbl>
## 1 nearshore 16.615
## 2 offshore 5.860
wilcox.test(Chla ~ Shore, data = nutrient, alternative = "greater")
##
## Wilcoxon rank sum test with continuity correction
##
## data: Chla by Shore
## W = 551, p-value = 0.0006685
## alternative hypothesis: true location shift is greater than 0
How is phycocyanin related to particulate microcystin levels in the early bloom versus the late bloom?
par(mfrow = c(1,2))
################ early bloom #############################
early <- nutrient %>%
filter(Month %in% c("June", "July", "August"))
# Fit model
fit_early <- lm(log(ParMC + 0.1) ~ log(Phycocyanin + 0.1), data = early)
# Plot model
plot(x = log(early$ParMC + 0.1), y = log(early$Phycocyanin + 0.1))
# Check residuals to make sure they are normal
hist(fit_early$residuals, xlab = "residuals", main = "")
# summary of model
summary(fit_early)
##
## Call:
## lm(formula = log(ParMC + 0.1) ~ log(Phycocyanin + 0.1), data = early)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26150 -0.40679 0.06957 0.36975 1.77811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.59181 0.18230 -8.732 1.76e-09 ***
## log(Phycocyanin + 0.1) 0.94592 0.07619 12.416 6.64e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7367 on 28 degrees of freedom
## Multiple R-squared: 0.8463, Adjusted R-squared: 0.8408
## F-statistic: 154.2 on 1 and 28 DF, p-value: 6.641e-13
par(mfrow = c(1,2))
################ late bloom #############################
late <- nutrient %>%
filter(Month %in% c("September", "October"))
# Plot model
plot(x = log(late$ParMC + 0.1), y = log(late$Phycocyanin + 0.1))
# Fit model
fit_late <- lm(log(ParMC + 0.1) ~ log(Phycocyanin + 0.1), data = late)
# Check residuals to make sure they are normal
hist(fit_late$residuals, xlab = "residuals", main = "")
# summary of model
summary(fit_late)
##
## Call:
## lm(formula = log(ParMC + 0.1) ~ log(Phycocyanin + 0.1), data = late)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.69001 -0.31917 0.07973 0.35119 1.07065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.4707 0.1413 -10.411 1.41e-10 ***
## log(Phycocyanin + 0.1) 0.4646 0.0708 6.562 7.11e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.645 on 25 degrees of freedom
## Multiple R-squared: 0.6327, Adjusted R-squared: 0.618
## F-statistic: 43.06 on 1 and 25 DF, p-value: 7.114e-07
Based on these two linear models, we see that the slope estimate for phycocyanin is much lower in the late part of the bloom than the early part of the bloom.
In this figure we will display in part A a barplot with the relative abundance of cyanobacteria genera over time and site. In part B we will break up these groups into OTUs and show lineplots for each non-rare OTU over time (mean rel abundance > 0.0001) and site.
## Select only cyano OTUs that have mean relative abundace > 0.0005
n = 15000
thresh = 0.0005
# Calculate mean relative abundance for each OTU
tax_mean <- taxa_sums(erie_scale)/nsamples(erie_scale)
# Prune low abundance taxa using thresh as mean relative abundance
erie_prune_0001 <-
erie_scale %>%
prune_taxa(tax_mean > thresh*n, .)
# Create a melted data frame of selected cyanobacteria OTUs
cyano_otus <-
erie_prune_0001 %>%
transform_sample_counts(function(x) {x/sum(x)}) %>%
subset_taxa(Class == "Cyanobacteria") %>%
psmelt() %>%
order_dates()
################# Plot A #######################
cyano_genus <-
cyano_otus %>%
group_by(Genus, Date, Station) %>%
summarize(Abundance = sum(Abundance)) %>%
arrange(Genus) %>%
order_dates()
plot2a <- ggplot(cyano_genus, aes(x = Date, y = Abundance, fill = Genus)) +
facet_grid(~Station) +
geom_bar(stat = "identity") +
ylab("rel. abundance") +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
) +
scale_fill_manual(values = c("#755147", "#74b67f", "#3232ff" , "#753376", "#ff9a00")) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 9),
plot.title = element_text(size = 10, face = "bold"),
strip.background = element_blank()
)
genus_legend <- grab_legend(plot2a)
plot2a <- plot2a + theme(legend.position = "none")
# Function to make a ggplot lineplot of an OTU's relative abundance over time
#
# Args:
# df: a melted data frame generated by calling psmelt() on a phyloseq object.
# Contains an "Abundance" column for the OTU's abundance
# otu: the OTU to generate a lineplot for
# taxrank: the taxonomic rank to appear in the plot title (e.g. "Genus")
# Returns:
# a ggplot lineplot
plot_otus <- function(df, otu, taxrank) {
ggplot(df,
aes(x = Date, y = Abundance, group = Station, color = Station, shape = Station)) +
geom_point(size = 2) +
geom_line(size = 0.7) +
ggtitle(paste(df[1, taxrank], otu)) +
ylab("rel. abund") +
scale_color_manual(values = station_colors) +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 9),
legend.position = "none",
plot.title = element_text(size = 10, face = "bold")
)
}
##################### Plot B #############################
cyano_otu_names <- as.list(levels(cyano_otus$Species))
names(cyano_otu_names) <- levels(cyano_otus$Species)
# Generate a lineplot for each cyanobacteria with mean relative abundance > 0.0001
cyano_otu_plots <- lapply(cyano_otu_names,
function(otu) {
df_otu <- filter(cyano_otus, OTU == otu)
plot <- plot_otus(df = df_otu, otu = otu, taxrank = "Genus")
return(plot)
}
)
plot2b <- arrangeGrob(
cyano_otu_plots$Otu00007, cyano_otu_plots$Otu00177, cyano_otu_plots$Otu00005,
cyano_otu_plots$Otu00044, cyano_otu_plots$Otu00304, cyano_otu_plots$Otu00037,
cyano_otu_plots$Otu00147, cyano_otu_plots$Otu00193, cyano_otu_plots$Otu00049,
ncol = 3, nrow = 3
)
##################### Compile plots #####################################
ggdraw() +
draw_plot(plot2a, x = 0.03, y = 0.7, width = .8, height = 0.28) +
draw_plot(genus_legend, x = 0.83, y = .76, width = .16, height = .18) +
draw_plot(plot2b, x = 0.03, y = 0.02, width = .8, height = 0.64) +
draw_plot(station_legend, x = 0.80, y = .5, width = .18, height = .18) +
draw_plot_label(c("A", "B"), c(0.02, 0.02), c(0.97, 0.68), size = 14)
ggsave("../../Plots/Figure2.pdf", plot = bloom_plots, width = 10, height = 8)
What is the most abundant genus?
cyano_genus %>%
group_by(Genus) %>%
summarise(median(Abundance))
## # A tibble: 5 x 2
## Genus median(Abundance)
## <fctr> <dbl>
## 1 Anabaena 0.000000000
## 2 Microcystis 0.024369748
## 3 Pseudanabaena 0.002794857
## 4 Synechococcus 0.038433382
## 5 unclassified 0.002680188
What if we look at each site separately?
cyano_genus %>%
group_by(Genus, Station) %>%
summarise(median(Abundance))
## Source: local data frame [15 x 3]
## Groups: Genus [?]
##
## Genus Station median(Abundance)
## <fctr> <fctr> <dbl>
## 1 Anabaena nearshore1 0.0000000000
## 2 Anabaena nearshore2 0.0000000000
## 3 Anabaena offshore 0.0000000000
## 4 Microcystis nearshore1 0.0318316878
## 5 Microcystis nearshore2 0.0413849336
## 6 Microcystis offshore 0.0070832091
## 7 Pseudanabaena nearshore1 0.0026464855
## 8 Pseudanabaena nearshore2 0.0043612088
## 9 Pseudanabaena offshore 0.0009692812
## 10 Synechococcus nearshore1 0.0327130113
## 11 Synechococcus nearshore2 0.0382854263
## 12 Synechococcus offshore 0.0476329846
## 13 unclassified nearshore1 0.0042086796
## 14 unclassified nearshore2 0.0045272706
## 15 unclassified offshore 0.0019813605
Which Cyanobacteria are associated with each other? Spearman’s rho correlations are shown and significant correlations are bolded.
# Create a phyloseq object of only cyanobacteria OTUs
cyano_physeq <-
erie_prune_0001 %>%
transform_sample_counts(function(x) {x/sum(x)}) %>%
subset_taxa(Class == "Cyanobacteria")
taxa_names(cyano_physeq) <- paste(
tax_table(cyano_physeq)[ , "Genus"],
taxa_names(cyano_physeq)
)
# Run pairwise spearman correlation tests between all cyanobacteria genera.
cyano_corrs_spearman <- corr.test(
t(otu_table(cyano_physeq)),
use = "complete",
method = "spearman",
adjust = "fdr"
)
emphasize.strong.cells(which(cyano_corrs_spearman$p < 0.05, arr.ind = TRUE))
pander(signif(cyano_corrs_spearman$r, digits = 2))
| Â | Microcystis Otu00005 | Synechococcus Otu00007 |
|---|---|---|
| Microcystis Otu00005 | 1 | -0.063 |
| Synechococcus Otu00007 | -0.063 | 1 |
| Pseudanabaena Otu00037 | 0.77 | 0.13 |
| Synechococcus Otu00044 | 0.22 | 0.46 |
| unclassified Otu00049 | 0.63 | 0.18 |
| Synechococcus Otu00147 | -0.23 | 0.6 |
| Synechococcus Otu00177 | 0.49 | 0.088 |
| Anabaena Otu00193 | 0.56 | -0.052 |
| unclassified Otu00304 | 0.57 | 0.027 |
| Â | Pseudanabaena Otu00037 | Synechococcus Otu00044 |
|---|---|---|
| Microcystis Otu00005 | 0.77 | 0.22 |
| Synechococcus Otu00007 | 0.13 | 0.46 |
| Pseudanabaena Otu00037 | 1 | 0.28 |
| Synechococcus Otu00044 | 0.28 | 1 |
| unclassified Otu00049 | 0.68 | 0.52 |
| Synechococcus Otu00147 | 0.086 | 0.31 |
| Synechococcus Otu00177 | 0.6 | 0.25 |
| Anabaena Otu00193 | 0.26 | 0.092 |
| unclassified Otu00304 | 0.61 | -0.022 |
| Â | unclassified Otu00049 | Synechococcus Otu00147 |
|---|---|---|
| Microcystis Otu00005 | 0.63 | -0.23 |
| Synechococcus Otu00007 | 0.18 | 0.6 |
| Pseudanabaena Otu00037 | 0.68 | 0.086 |
| Synechococcus Otu00044 | 0.52 | 0.31 |
| unclassified Otu00049 | 1 | -0.16 |
| Synechococcus Otu00147 | -0.16 | 1 |
| Synechococcus Otu00177 | 0.74 | -0.27 |
| Anabaena Otu00193 | 0.47 | -0.51 |
| unclassified Otu00304 | 0.62 | -0.2 |
| Â | Synechococcus Otu00177 | Anabaena Otu00193 |
|---|---|---|
| Microcystis Otu00005 | 0.49 | 0.56 |
| Synechococcus Otu00007 | 0.088 | -0.052 |
| Pseudanabaena Otu00037 | 0.6 | 0.26 |
| Synechococcus Otu00044 | 0.25 | 0.092 |
| unclassified Otu00049 | 0.74 | 0.47 |
| Synechococcus Otu00147 | -0.27 | -0.51 |
| Synechococcus Otu00177 | 1 | 0.35 |
| Anabaena Otu00193 | 0.35 | 1 |
| unclassified Otu00304 | 0.45 | 0.44 |
| Â | unclassified Otu00304 |
|---|---|
| Microcystis Otu00005 | 0.57 |
| Synechococcus Otu00007 | 0.027 |
| Pseudanabaena Otu00037 | 0.61 |
| Synechococcus Otu00044 | -0.022 |
| unclassified Otu00049 | 0.62 |
| Synechococcus Otu00147 | -0.2 |
| Synechococcus Otu00177 | 0.45 |
| Anabaena Otu00193 | 0.44 |
| unclassified Otu00304 | 1 |
# My own subsetting function, similar to phyloseq::subset_taxa, except taxa can
# be passed as arguments within functions without weird environment errors
#
# Args:
# physeq: a phyloseq object
# taxrank: taxonomic rank to filter on
# taxa: a vector of taxa groups to filter on
#
# Returns:
# a phyloseq object subsetted to the x taxa in taxrank
my_subset_taxa <- function(physeq, taxrank, taxa) {
physeq_tax_sub <- tax_table(physeq)[tax_table(physeq)[ , taxrank] %in% taxa, ]
tax_table(physeq) <- physeq_tax_sub
return(physeq)
}
Here we estimate alpha diversity by sampling with replacement 100x and averaging OTU richness and Simpson’s E over each of the trials
# Initialize parameters
trials = 100
min_lib = min(sample_sums(erie)) # Depth we are rarefying to
# Groups to estimate alpha diversity for
mytaxa <- c(
"Bacteria", "NcBacteria", "Actinobacteria", "Alphaproteobacteria",
"Betaproteobacteria", "Bacteroidetes", "Gammaproteobacteria",
"Deltaproteobacteria", "Verrucomicrobia"
)
names(mytaxa) <- mytaxa
# Taxonomic ranks of mytaxa
mytaxa_taxrank <- c(
"Kingdom", "Class", "Phylum", "Class", "Class",
"Phylum", "Class", "Class", "Phylum"
)
names(mytaxa_taxrank) <- mytaxa
# Data frame to hold alpha diversity estimates over trials
alphadiv_df <- data.frame(matrix(nrow = nsamples(erie), ncol = trials))
# Initialize empty df's for richness and evenness of all taxa in mytaxa
richness <- lapply(mytaxa, function(x) {return(alphadiv_df)} )
simpson <- lapply(mytaxa, function(x) {return(alphadiv_df)} )
alphadiv_list <- list(richness = richness, simpson = simpson)
# It is always important to set a seed when you subsample so your result is replicable
set.seed(3)
# Run trials to subsample and estimate diversity
for (i in 1:trials) {
# Subsample
rarefied_physeq <- rarefy_even_depth(erie, sample.size = min_lib, verbose = FALSE, replace = TRUE)
# Generate alpha-diversity estimates for each taxonomic group
for (t in mytaxa) {
# Subset physeq object to taxa in mytaxa
if (t != "NcBacteria") {
physeq_sub <- my_subset_taxa(
physeq = rarefied_physeq,
taxrank = mytaxa_taxrank[t],
taxa = t
)
} else {
physeq_sub <- subset_taxa(physeq = rarefied_physeq, Class != "Cyanobacteria")
}
# Calculate observed richness for that group and store value in a df column
richness <- estimate_richness(physeq_sub, measures = "Observed")[ ,1]
alphadiv_list$richness[[t]][ ,i] <- richness
# Calculate Simpson's E for that group and store value in a df column
alphadiv_list$simpson[[t]][ ,i] <- (estimate_richness(physeq_sub, measures = "InvSimpson")[ ,1]/richness)
}
}
# Calculate the means of richness and inverse simpson from the 100 trials
alphadiv_est <- lapply(alphadiv_list, function(div_measure) {
lapply(div_measure, function(taxa_group) {
alpha_mean <- rowMeans(taxa_group)
return(alpha_mean)
})
})
# Convert alphadiv_est richness and simpson's E lists into wide data frames
l <- lapply(alphadiv_est, function(x) {
# convert from list to data.frame
est_df <- plyr::ldply(.data = x, .fun = data.frame)
names(est_df) <- c("Taxa", "Diversity")
# Add in SampleID column and spread to wide format
r <- est_df %>%
mutate(SampleID = rep(sample_names(erie), length(mytaxa)))
return(r)
})
# Merge sample metadata with these estimates
merge_dat <- data.frame(sample_data(erie)) %>%
select(SampleID, logChla, pH, logPhyco, TP, Turbidity, Station, Date, Days) %>%
mutate(logTP = log(TP)) %>%
mutate(logTurb = log(Turbidity))
# Create a df with a "Diversity" column that includes richness and inv. simpson,
# and log-chl a values from erie sample_data
alpha_comb <- l$richness %>%
left_join(y = l$simpson, by = c("Taxa", "SampleID")) %>% # Join the richness and inv_simp df's
rename(Richness = Diversity.x, Simpson = Diversity.y) %>% # rename columns to avoid confusion
left_join(merge_dat, by = "SampleID") %>% # Join with merged nutrient data
gather(key = "Alphadiv", value = "Estimate", Richness, Simpson) %>%
order_dates()
# Function to test whether there is a linear relationship
# between log chla and alpha diversity of a group
#
# Args:
# df: a data frame with columns for diversity estimate and bloom variable (e.g. logChla, logPhyco)
#
# Returns:
# a vector with the pvalue and R2 of the linear model
test_alphadiv_pp <- function(df, bloomvar) {
df_sub <- na.omit(df[, c(bloomvar, "Estimate")])
# Fit a linear model
fit <- lm(reformulate(termlabels = bloomvar, response = "Estimate"), data = df_sub)
# Grab model outputs
fit_pvalue <- summary(fit)$coef[2,4]
fit_r2 <- summary(fit)$r.squared
return(c(fit_pvalue, fit_r2))
}
# Function to make a ggplot scatterplot of logChla vs an alpha-diversity metric.
# If the pvalue is below 0.05, it will also plot the fitted line
#
# Args:
# df: a melted data frame with a column called logChla and value for alpha-diversity
# measure: Alpha-diversity measure (e.g. "InvSimpson" or "Observed")
# group: Taxonomic group to plot (e.g. "Betaproteobacteria")
# pvalue: pvalue from linear model returned by test_alphadiv_pp
# r2: r2 from linear model returned by test_alphadiv_pp
#
# Returns:
# a ggplot
make_alphadiv_plot <- function(df, bloomvar, measure, group, pvalue, p_x, r2) {
g <- ggplot(df, aes_string(x = bloomvar, y = "Estimate")) +
geom_point() +
ylab(measure) +
ggtitle(group)
# Since we rounded to 3 sigfigs, estimates of 0 need to actually say "p < 0.001"
if (pvalue == 0) {
g <- g + annotate(
"text",
x = p_x,
y = max(df$Estimate) - 0.03*max(df$Estimate),
size = 3,
label = "p < 0.001"
)
} else if (pvalue < 0.05 & pvalue != 0) {
g <- g + annotate(
"text",
x = p_x,
y = max(df$Estimate) - 0.03*max(df$Estimate),
size = 3,
label = paste("p =", pvalue)
)
}
if (pvalue < 0.05) {
g <- g +
annotate(
"text",
x = p_x,
y = max(df$Estimate) - 0.08*max(df$Estimate),
size = 3,
label = paste("R2 =", r2)
) +
geom_smooth(method = "lm", size = 1)
}
return(g)
}
divs <- named_list("Richness", "Simpson")
# apply alpha div test to each diversity index for each group
alpha_models <- lapply(divs, function(d) {
alpha_sub <- alpha_comb %>% filter(Alphadiv == d)
lapply(mytaxa, function(t) {
alpha_sub <- alpha_sub %>% filter(Taxa == t)
# Fit linear model
fit <- test_alphadiv_pp(alpha_sub, "logChla")
return(fit)
})
})
# Unlist
alpha_results <- lapply(alpha_models, function(x) {
f <- x %>%
unlist(use.names = FALSE) %>%
matrix(
nrow = length(mytaxa),
ncol = 2,
byrow = TRUE,
dimnames = list(mytaxa, c("pvalue","r2"))
)
# fdr correction on pvalues
f[ ,1] <- p.adjust(f[ ,1], method = "fdr")
# Round to three significant digits
f <- round(f, digits = 3)
})
## Make plots for Simpson's E vs log chla
simp_plots <- list()
for (i in 1:length(mytaxa)) {
df <- filter(alpha_comb, Taxa == mytaxa[i]) %>%
filter(Alphadiv == "Simpson")
simp_plots[[i]] <- make_alphadiv_plot(
df = df,
bloomvar = "logChla",
measure = "Simpson's E",
group = mytaxa[i],
pvalue = alpha_results$Simpson[i, 1],
p_x = 0.5,
r2 = alpha_results$Simpson[i, 2]
)
}
# Fit quadratic to alphaproteo evenness
alphaproteo <- filter(alpha_comb, Taxa == "Alphaproteobacteria") %>%
filter(Alphadiv == "Simpson")
quad_fit <- lm(Estimate ~ logChla + I(logChla^2), data = alphaproteo)
summary(quad_fit)
##
## Call:
## lm(formula = Estimate ~ logChla + I(logChla^2), data = alphaproteo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.097202 -0.036048 -0.004634 0.031577 0.141354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.097811 0.030413 3.216 0.002280 **
## logChla 0.101997 0.028571 3.570 0.000800 ***
## I(logChla^2) -0.021940 0.006266 -3.501 0.000984 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05017 on 50 degrees of freedom
## Multiple R-squared: 0.2035, Adjusted R-squared: 0.1716
## F-statistic: 6.387 on 2 and 50 DF, p-value: 0.003387
simp_plots[[4]] <- simp_plots[[4]] +
stat_smooth(method = "lm", formula = y ~ poly(x, 2)) +
annotate(
"text",
x = 0.9,
y = .34,
size = 3,
label = paste("p < 0.001")
) +
annotate(
"text",
x = 0.9,
y = .31,
size = 3,
label = paste("R2 =", round(summary(quad_fit)$adj.r.squared, 3))
)
## Make plots for observed richness vs log chla
rich_plots <- list()
for (i in 1:length(mytaxa)) {
df <- filter(alpha_comb, Taxa == mytaxa[i]) %>%
filter(Alphadiv == "Richness")
rich_plots[[i]] <- make_alphadiv_plot(
df = df,
bloomvar = "logChla",
measure = "Obs. Richness",
group = mytaxa[i],
pvalue = alpha_results$Richness[i, 1],
p_x = 0.5,
r2 = alpha_results$Richness[i, 2]
)
}
Seasonal alpha diversity plots
alphadiv_ncbacteria_simpson <- alpha_comb %>%
filter(Taxa == "NcBacteria") %>%
filter(Alphadiv == "Simpson") %>%
order_dates()
seasonE <- ggplot(alphadiv_ncbacteria_simpson, aes(x = Date, y = Estimate, group = Station, color = Station, shape = Station)) +
geom_point() +
geom_line() +
scale_color_manual(values = station_colors) +
ylab("Simpson's E") +
xlab("") +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
) +
theme(legend.position = "none") +
ggtitle("Nc-Bacteria")
alphadiv_ncbacteria_rich <- alpha_comb %>%
filter(Taxa == "NcBacteria") %>%
filter(Alphadiv == "Richness") %>%
order_dates()
seasonRich <- ggplot(alphadiv_ncbacteria_rich, aes(x = Date, y = Estimate, group = Station, color = Station, shape = Station)) +
geom_point() +
geom_line() +
scale_color_manual(values = station_colors) +
ylab("Obs. Richness") +
xlab("") +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
) +
theme(legend.position = "none") +
ggtitle("Nc-Bacteria")
## Arrange plots for final figure
ggdraw() +
draw_plot(seasonRich, x = 0.01, y = 0.73, width = 0.48, height = 0.24) +
draw_plot(rich_plots[[4]], x = 0.01, y = 0.49, width = 0.48, height = 0.24) +
draw_plot(rich_plots[[5]], x = 0.01, y = 0.25, width = 0.48, height = 0.24) +
draw_plot(rich_plots[[6]], x = 0.01, y = 0.01, width = 0.48, height = 0.24) +
draw_plot(seasonE, x = 0.52, y = 0.73, width = 0.48, height = 0.24) +
draw_plot(simp_plots[[4]], x = 0.52, y = 0.49, width = 0.48, height = 0.24) +
draw_plot(simp_plots[[5]], x = 0.52, y = 0.25, width = 0.48, height = 0.24) +
draw_plot(simp_plots[[6]], x = 0.52, y = 0.01, width = 0.48, height = 0.24) +
draw_plot_label(c("A", "B", "C", "D", "E", "F", "G", "H"), c(0, 0, 0, 0, .52, .52, .52, .52), c(0.99, 0.74, 0.5, 0.26), size = 14)
ggsave("../../Plots/Figure3.pdf", height = 10, width = 7)
Other bloom measurements
cor.test(nutrient$logChla, nutrient$pH)
##
## Pearson's product-moment correlation
##
## data: nutrient$logChla and nutrient$pH
## t = 9.149, df = 52, p-value = 2.044e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6554519 0.8701501
## sample estimates:
## cor
## 0.7853757
######## pH ###########
# apply alpha div test to each diversity index for each group
alpha_models <- lapply(divs, function(d) {
alpha_sub <- alpha_comb %>% filter(Alphadiv == d)
lapply(mytaxa, function(t) {
alpha_sub <- alpha_sub %>% filter(Taxa == t)
# Fit linear model
fit <- test_alphadiv_pp(alpha_sub, "pH")
return(fit)
})
})
# Unlist
alpha_results <- lapply(alpha_models, function(x) {
f <- x %>%
unlist(use.names = FALSE) %>%
matrix(
nrow = length(mytaxa),
ncol = 2,
byrow = TRUE,
dimnames = list(mytaxa, c("pvalue","r2"))
)
# fdr correction on pvalues
f[ ,1] <- p.adjust(f[ ,1], method = "fdr")
# Round to three significant digits
f <- round(f, digits = 3)
})
simp_plots_ph <- list()
for (i in 1:length(mytaxa)) {
df <- filter(alpha_comb, Taxa == mytaxa[i]) %>%
filter(Alphadiv == "Simpson")
simp_plots_ph[[i]] <- make_alphadiv_plot(
df = df,
bloomvar = "pH",
measure = "Simpson's E",
group = mytaxa[i],
pvalue = alpha_results$Simpson[i, 1],
p_x = 8.1,
r2 = alpha_results$Simpson[i, 2]
)
}
# Fit quadratic to alphaproteo evenness
alphaproteo <- filter(alpha_comb, Taxa == "Alphaproteobacteria") %>%
filter(Alphadiv == "Simpson")
quad_fit <- lm(Estimate ~ pH + I(pH^2), data = alphaproteo)
summary(quad_fit)
##
## Call:
## lm(formula = Estimate ~ pH + I(pH^2), data = alphaproteo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.125636 -0.031979 -0.003435 0.029431 0.132886
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.55867 3.55752 -3.530 0.000941 ***
## pH 3.01789 0.83005 3.636 0.000686 ***
## I(pH^2) -0.17817 0.04835 -3.685 0.000591 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04824 on 47 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.2789, Adjusted R-squared: 0.2482
## F-statistic: 9.087 on 2 and 47 DF, p-value: 0.0004607
simp_plots_ph[[4]] <- simp_plots_ph[[4]] +
stat_smooth(method = "lm", formula = y ~ poly(x, 2)) +
annotate(
"text",
x = 8.1,
y = .34,
size = 3,
label = paste("p < 0.001")
) +
annotate(
"text",
x = 8.1,
y = .31,
size = 3,
label = paste("R2 =", round(summary(quad_fit)$adj.r.squared, 3))
)
grid.arrange(simp_plots_ph[[4]], simp_plots_ph[[5]], simp_plots_ph[[6]])
############# Phycocyanin #############
# apply alpha div test to each diversity index for each group
alpha_models <- lapply(divs, function(d) {
alpha_sub <- alpha_comb %>% filter(Alphadiv == d)
lapply(mytaxa, function(t) {
alpha_sub <- alpha_sub %>% filter(Taxa == t)
# Fit linear model
fit <- test_alphadiv_pp(alpha_sub, "logPhyco")
return(fit)
})
})
# Unlist
alpha_results <- lapply(alpha_models, function(x) {
f <- x %>%
unlist(use.names = FALSE) %>%
matrix(
nrow = length(mytaxa),
ncol = 2,
byrow = TRUE,
dimnames = list(mytaxa, c("pvalue","r2"))
)
# fdr correction on pvalues
f[ ,1] <- p.adjust(f[ ,1], method = "fdr")
# Round to three significant digits
f <- round(f, digits = 3)
})
simp_plots_phyco <- list()
for (i in 1:length(mytaxa)) {
df <- filter(alpha_comb, Taxa == mytaxa[i]) %>%
filter(Alphadiv == "Simpson")
simp_plots_phyco[[i]] <- make_alphadiv_plot(
df = df,
bloomvar = "logPhyco",
measure = "Simpson's E",
group = mytaxa[i],
pvalue = alpha_results$Simpson[i, 1],
p_x = -1.5,
r2 = alpha_results$Simpson[i, 2]
)
}
# Fit quadratic to alphaproteo evenness
alphaproteo <- filter(alpha_comb, Taxa == "Alphaproteobacteria") %>%
filter(Alphadiv == "Simpson")
quad_fit <- lm(Estimate ~ logPhyco + I(logPhyco^2), data = alphaproteo)
summary(quad_fit)
##
## Call:
## lm(formula = Estimate ~ logPhyco + I(logPhyco^2), data = alphaproteo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.116487 -0.036608 -0.009642 0.033543 0.136508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.209797 0.009189 22.831 <2e-16 ***
## logPhyco 0.005633 0.007166 0.786 0.4355
## I(logPhyco^2) -0.004930 0.002284 -2.158 0.0357 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05253 on 50 degrees of freedom
## Multiple R-squared: 0.1268, Adjusted R-squared: 0.09183
## F-statistic: 3.629 on 2 and 50 DF, p-value: 0.03375
simp_plots_phyco[[4]] <- simp_plots_phyco[[4]] +
stat_smooth(method = "lm", formula = y ~ poly(x, 2)) +
annotate(
"text",
x = -1.5,
y = .34,
size = 3,
label = paste("p = 0.035")
) +
annotate(
"text",
x = -1.5,
y = .31,
size = 3,
label = paste("R2 =", round(summary(quad_fit)$adj.r.squared, 3))
)
grid.arrange(simp_plots_phyco[[4]], simp_plots_phyco[[5]], simp_plots_phyco[[6]])
# Subset to non-cyanobacteria and scale internally
ncbacteria <-
erie %>%
subset_taxa(Class != "Cyanobacteria") %>%
scale_reads(round = "round")
# Generate pcoa scores for each subset
ncbact_pcoa <- ordinate(
physeq = ncbacteria,
method = "PCoA",
distance = "bray"
)
# Generate a df to plot pcoa for each subset
pcoa_df <- plot_ordination(
physeq = ncbacteria,
axes = 1:3,
ordination = ncbact_pcoa,
justDF = TRUE
)
pcoa_df <- pcoa_df %>%
rename(PC1 = Axis.1, PC2 = Axis.2, PC3 = Axis.3) %>%
order_dates() %>%
# Flip orientation of PC2 for Cyanobacteria (does not affect interpretation)
mutate(PC2 = -PC2)
# Generate relative, lingoes-corrected eigenvalues for PC1 and PC2
pcs <- c(
PC1 = signif(ncbact_pcoa$values$Rel_corr_eig[1]*100, 3),
PC2 = signif(ncbact_pcoa$values$Rel_corr_eig[2]*100, 3),
PC3 = signif(ncbact_pcoa$values$Rel_corr_eig[3]*100, 3)
)
# Function to create a plot of time (x-axis) vs PC scores (y-axis)
plot_pcts <- function(df, pc, eigs) {
ggplot(df,
aes_string(x = "Date", y = pc, group = "Station", color = "Station", shape = "Station")) +
geom_point(size = 2.5) +
geom_line(size = 1.1) +
scale_color_manual(values = station_colors) +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
) +
ylab(paste(pc, " ", eigs[pc], "%")) +
theme(
axis.title.x = element_blank(),
plot.title = element_text(face = "bold", size = 16),
legend.position = "none",
plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm")
)
}
# Non-cyano PC time series plots
ncbact_pcs <- named_list("PC1", "PC2", "PC3")
ncbact_pc_plots <- lapply(ncbact_pcs,
function(x) {
plot_pcts(pcoa_df, x, eigs = pcs)
}
)
# Plot for pH
ph_plot <- ggplot(pcoa_df, aes(x = PC1, y = pH)) +
geom_point(size = 2.5, aes(shape = Station, color = Station)) +
scale_color_manual(values = station_colors) +
geom_smooth(method = "lm", color = "black") +
theme(legend.position = "none")
# Plot for Temperature
temp_plot <- ggplot(pcoa_df, aes(x = PC2, y = Temp)) +
geom_point(size = 2.5, aes(shape = Station, color = Station)) +
scale_color_manual(values = station_colors) +
geom_smooth(method = "lm", color = "black") +
theme(legend.position = "none") +
ylab("Temperature")
# Plot for SpCond
spcond_plot <- ggplot(pcoa_df, aes(x = PC3, y = SpCond)) +
geom_point(size = 2.5, aes(shape = Station, color = Station)) +
scale_color_manual(values = station_colors) +
geom_smooth(method = "lm", color = "black") +
theme(legend.position = "none") +
ylab("Specific Conductivity")
ggdraw() +
## non-cyano
draw_plot(ncbact_pc_plots$PC1, x = 0.02, y = 0.5, width = 0.3, height = 0.42) +
draw_plot(ncbact_pc_plots$PC2, x = 0.34, y = 0.5, width = 0.3, height = 0.42) +
draw_plot(ncbact_pc_plots$PC3, x = 0.66, y = 0.5, width = 0.3, height = 0.42) +
draw_plot(ph_plot, x = 0.02, y = 0.0, width = 0.3, height = 0.42) +
draw_plot(temp_plot, x = 0.34, y = 0.0, width = 0.3, height = 0.42) +
draw_plot(spcond_plot, x = 0.66, y = 0.0, width = 0.3, height = 0.42)
How many PCoA dimensions should we examine?
plot(1:nrow(ncbact_pcoa$values), ncbact_pcoa$values$Rel_corr_eig, xlab =
"Principal Coordinate", ylab = "Relative eigenvalue")
How good is my PCoA in three dimensions?
pcoa_dist <- as.vector(dist(ncbact_pcoa$vectors[,1:3]))
bray_dist <- as.vector(phyloseq::distance(ncbacteria, method = "bray"))
plot(bray_dist, pcoa_dist) +
abline(lm(pcoa_dist~bray_dist), col = "blue")
## numeric(0)
summary(lm(pcoa_dist~bray_dist))
##
## Call:
## lm(formula = pcoa_dist ~ bray_dist)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.307054 -0.027616 0.007518 0.036637 0.124308
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.248702 0.005806 -42.83 <2e-16 ***
## bray_dist 1.237353 0.011810 104.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05377 on 1376 degrees of freedom
## Multiple R-squared: 0.8886, Adjusted R-squared: 0.8885
## F-statistic: 1.098e+04 on 1 and 1376 DF, p-value: < 2.2e-16
What is the max bray-curtis between time points for each site?
# Histogram of bray-curtis distance
hist(bray_dist)
# Nearshore 1
n1 <- subset_samples(erie_scale, Station == "nearshore1")
n1_bdist <- phyloseq::distance(n1, method = "bray")
max(as.matrix(n1_bdist)[1,])
## [1] 0.783699
# Nearshore 2
n2 <- subset_samples(erie_scale, Station == "nearshore2")
n2_bdist <- phyloseq::distance(n2, method = "bray")
max(as.matrix(n2_bdist)[1,])
## [1] 0.8121858
# Offshore
o <- subset_samples(erie_scale, Station == "offshore")
o_bdist <- phyloseq::distance(o, method = "bray")
max(as.matrix(o_bdist)[1,])
## [1] 0.6426857
What is the bray-curtis between the first and last time points
# Nearshore 1
n1 <- subset_samples(erie_scale, Station == "nearshore1" & Date %in% c("6/16", "10/27"))
n1_bdist <- phyloseq::distance(n1, method = "bray")
n1_bdist
## E0012_CNA
## E0132_CNA 0.4601162
# Nearshore 2
n2 <- subset_samples(erie_scale, Station == "nearshore2" & Date %in% c("6/16", "10/27"))
n2_bdist <- phyloseq::distance(n2, method = "bray")
n2_bdist
## E0010_CNA
## E0130_CNA 0.4530431
# Offshore
o <- subset_samples(erie_scale, Station == "offshore" & Date %in% c("6/16", "10/27"))
o_bdist <- phyloseq::distance(o, method = "bray")
o_bdist
## E0014_CNA
## E0134_CNA 0.3639581
# Variables to include in cyano models
cyano_vars <- c("Nitrate", "SRP", "Temp", "H2O2", "SpCond", "Ammonia", "Turbidity", "TP", "Days")
# Examine distributions of potential env variables to normalize
par(mfrow = c(3,3))
for (var in cyano_vars) {
hist(nutrient[ ,var], main = "", xlab = var, ylab = "" )
}
# Adjusted variables to include in cyano models
cyano_vars <- c("sqrt(Nitrate)", "log(SRP)", "Temp", "log(H2O2)", "SpCond", "log(Ammonia)", "log(Turbidity)", "log(TP)", "Days")
# Variables to include in nc-bacteria models
non_cyano_vars <- c(cyano_vars, "pH", "logParMC", "logChla", "logPhyco")
# Impute SpCond values for nearshore 1 on Sep 2 and Sep 8 with value for nearshore 2
# Change 9/2 value
pcoa_df$SpCond[pcoa_df$Date == "9/2" & pcoa_df$Station == "nearshore1"] <-
pcoa_df$SpCond[pcoa_df$Date == "9/2" & pcoa_df$Station == "nearshore2"]
# Change 9/8 value
pcoa_df$SpCond[pcoa_df$Date == "9/8" & pcoa_df$Station == "nearshore1"] <-
pcoa_df$SpCond[pcoa_df$Date == "9/8" & pcoa_df$Station == "nearshore2"]
get_models <- function(vars, response, dat) {
models <- list()
for (var in vars) {
models[[var]] <- lm(reformulate(termlabels = var, response = response), data = dat)
}
return(models)
}
ncbact_pcs <- named_list("PC1", "PC2", "PC3")
# Get the simple linear models for each variable along each PC
non_cyano_models <- lapply(ncbact_pcs, function(x) {
get_models(non_cyano_vars, x, dat = pcoa_df)
})
# Get rsquared for each model
rsquared <- lapply(non_cyano_models, function(x) {
lapply(x, function(y) {
r2 <- summary(y)$r.squared
return(r2)
})
})
rsquared_df <- rbind(data.frame(rsquared$PC1), data.frame(rsquared$PC2), data.frame(rsquared$PC3))
rsquared_df$PC <- c("PC1", "PC2", "PC3")
rsquared_df
## sqrt.Nitrate. log.SRP. Temp log.H2O2. SpCond log.Ammonia.
## 1 0.34015898 0.1167513 0.01044875 0.006574739 7.410362e-05 0.152346552
## 2 0.21463152 0.4557471 0.82567786 0.158273776 7.386702e-02 0.001826483
## 3 0.05033012 0.1076665 0.02063832 0.017335249 4.510982e-01 0.004945428
## log.Turbidity. log.TP. Days pH logParMC logChla
## 1 0.50543645 0.47435006 0.0556560685 0.39784576 0.30182961 0.43430566
## 2 0.01875346 0.02587116 0.7426635978 0.25433814 0.04398430 0.19320660
## 3 0.12044041 0.16451430 0.0008503211 0.07749959 0.02609096 0.04661143
## logPhyco PC
## 1 0.425056608 PC1
## 2 0.059674067 PC2
## 3 0.001428595 PC3
# Get rsquared for each model
pvalue <- lapply(non_cyano_models, function(x) {
lapply(x, function(y) {
p <- summary(y)$coefficients[2,4]
return(p)
})
})
pvalue_df <- rbind(data.frame(pvalue$PC1), data.frame(pvalue$PC2), data.frame(pvalue$PC3))
pvalue_df$PC <- c("PC1", "PC2", "PC3")
pvalue_df
## sqrt.Nitrate. log.SRP. Temp log.H2O2. SpCond
## 1 4.583466e-06 1.227683e-02 4.664099e-01 0.563816708 9.512187e-01
## 2 4.768864e-04 2.953102e-08 5.500437e-21 0.003177153 4.899057e-02
## 3 1.063189e-01 1.645540e-02 3.047689e-01 0.347336859 3.685796e-08
## log.Ammonia. log.Turbidity. log.TP. Days pH
## 1 0.00386036 2.449999e-09 1.194196e-08 8.901190e-02 9.117569e-07
## 2 0.76124022 3.281703e-01 2.499139e-01 1.190187e-16 1.882576e-04
## 3 0.61680975 1.089803e-02 2.585948e-03 8.357976e-01 5.027727e-02
## logParMC logChla logPhyco PC
## 1 2.041148e-05 8.088317e-08 1.235285e-07 PC1
## 2 1.317525e-01 9.910694e-04 7.793199e-02 PC2
## 3 2.478855e-01 1.205073e-01 7.881612e-01 PC3
Best models:
pc1_ph <- lm(PC1 ~ pH + Days, data = pcoa_df)
summary(pc1_ph)
##
## Call:
## lm(formula = PC1 ~ pH + Days, data = pcoa_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.294635 -0.070383 0.000344 0.085555 0.244193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.1660354 0.4454814 -9.352 2.66e-12 ***
## pH 0.4593039 0.0508065 9.040 7.49e-12 ***
## Days 0.0031160 0.0004867 6.402 6.62e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1265 on 47 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.6783, Adjusted R-squared: 0.6646
## F-statistic: 49.55 on 2 and 47 DF, p-value: 2.657e-12
pc1_turb <- lm(PC1 ~ Turbidity + Days, data = pcoa_df)
summary(pc1_turb)
##
## Call:
## lm(formula = PC1 ~ Turbidity + Days, data = pcoa_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31893 -0.08260 -0.01263 0.06917 0.37056
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2316885 0.0551331 -4.202 0.000109 ***
## Turbidity 0.0186560 0.0027340 6.824 1.13e-08 ***
## Days 0.0006826 0.0005381 1.269 0.210477
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.155 on 50 degrees of freedom
## Multiple R-squared: 0.511, Adjusted R-squared: 0.4915
## F-statistic: 26.13 on 2 and 50 DF, p-value: 1.706e-08
pc2 <- lm(PC2 ~ Temp, data = pcoa_df)
summary(pc2)
##
## Call:
## lm(formula = PC2 ~ Temp, data = pcoa_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.149786 -0.019076 0.007641 0.030056 0.113835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.546834 0.035881 -15.24 <2e-16 ***
## Temp 0.026864 0.001728 15.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05125 on 51 degrees of freedom
## Multiple R-squared: 0.8257, Adjusted R-squared: 0.8223
## F-statistic: 241.6 on 1 and 51 DF, p-value: < 2.2e-16
pc3 <- lm(PC3 ~ SpCond + Days, data = pcoa_df)
summary(pc3)
##
## Call:
## lm(formula = PC3 ~ SpCond + Days, data = pcoa_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.145381 -0.057191 -0.002854 0.057312 0.116025
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.8641631 0.1269292 -6.808 1.20e-08 ***
## SpCond 0.0029764 0.0004268 6.973 6.61e-09 ***
## Days 0.0005084 0.0002486 2.045 0.0462 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06842 on 50 degrees of freedom
## Multiple R-squared: 0.4935, Adjusted R-squared: 0.4732
## F-statistic: 24.35 on 2 and 50 DF, p-value: 4.125e-08
Distribution of model residuals:
hist(residuals(pc1_ph))
hist(residuals(pc2))
hist(residuals(pc3))
All distributions look normal enough.
Temporal autocorrelation of model residuals:
par(mfrow = c(3, 3))
# Look at temporal autocorrelation
n1 <- which(pcoa_df$Station == "nearshore1")
n2 <- which(pcoa_df$Station == "nearshore2")
o <- which(pcoa_df$Station == "offshore")
n1_ph <- which(!is.na(pcoa_df$pH) & pcoa_df$Station == "nearshore1")
n2_ph <- which(!is.na(pcoa_df$pH) & pcoa_df$Station == "nearshore2")
o_ph <- which(!is.na(pcoa_df$pH) & pcoa_df$Station == "offshore")
acf(residuals(pc1_ph)[n1_ph], main = "PC1 ~ pH; nearshore 1")
acf(residuals(pc1_ph)[n2_ph], main = "PC1 ~ pH; nearshore 2")
acf(residuals(pc1_ph)[o_ph], main = "PC1 ~ pH; offshore")
acf(residuals(pc2)[n1], main = "PC2 ~ Temp; nearshore 1")
acf(residuals(pc2)[n2], main = "PC2 ~ Temp; nearshore 2")
acf(residuals(pc2)[o], main = "PC2 ~ Temp; offshore")
acf(residuals(pc3)[n1], main = "PC3 ~ SpCond; nearshore 1")
acf(residuals(pc3)[n2], main = "PC3 ~ SpCond; nearshore 2")
acf(residuals(pc3)[o], main = "PC3 ~ SpCond; offshore")
There is some temporal autocorrelation in the residuals so lets look at the cv error.
par(mfrow = c(3, 3))
pacf(residuals(pc1_ph)[n1_ph], main = "PC1 ~ pH; nearshore 1")
pacf(residuals(pc1_ph)[n2_ph], main = "PC1 ~ pH; nearshore 2")
pacf(residuals(pc1_ph)[o_ph], main = "PC1 ~ pH; offshore")
pacf(residuals(pc2)[n1], main = "PC2 ~ Temp; nearshore 1")
pacf(residuals(pc2)[n2], main = "PC2 ~ Temp; nearshore 2")
pacf(residuals(pc2)[o], main = "PC2 ~ Temp; offshore")
pacf(residuals(pc3)[n1], main = "PC3 ~ SpCond; nearshore 1")
pacf(residuals(pc3)[n2], main = "PC3 ~ SpCond; nearshore 2")
pacf(residuals(pc3)[o], main = "PC3 ~ SpCond; offshore")
Cross validation error, leaving out one time point at a time:
dates <- levels(pcoa_df$Date)
loocv <- rep(0, length(dates))
names(loocv) <- dates
# PC1
for (date in dates) {
train <- filter(pcoa_df, Date != date)
test <- filter(pcoa_df, Date == date)
fit <- lm(PC1 ~ pH + Days, data = train)
pred <- predict(fit, test)
loocv[date] <- mean((test$PC1 - pred)^2)
}
mean(loocv[-18])
## [1] 0.01928425
# PC2
for (date in dates) {
train <- filter(pcoa_df, Date != date)
test <- filter(pcoa_df, Date == date)
fit <- lm(PC2 ~ Temp, data = train)
pred <- predict(fit, test)
loocv[date] <- mean((test$PC1 - pred)^2)
}
mean(loocv)
## [1] 0.05223402
# PC3
for (date in pcoa_df$Date) {
train <- filter(pcoa_df, Date != date)
test <- filter(pcoa_df, Date == date)
fit <- lm(PC3 ~ SpCond + Days, data = train)
pred <- predict(fit, test)
loocv[date] <- mean((test$PC1 - pred)^2)
}
mean(loocv)
## [1] 0.04727986
library(psych)
n = min(sample_sums(erie))
thresh = 0.001
# Calculate mean relative abundance for each OTU
tax_mean <- taxa_sums(ncbacteria)/nsamples(ncbacteria)
# Prune low abundance taxa using thresh as mean relative abundance
nc_prune <-
ncbacteria %>%
prune_taxa(tax_mean > thresh*n, .)
mc <-
erie_scale %>%
subset_taxa(Species == "Otu00005")
# Calculate correlation between microcystis and nc-bacteria
mc_corrs <- corr.test(
x = t(otu_table(mc)),
y = t(otu_table(nc_prune)),
method = "spearman",
use = "complete",
adjust = "fdr"
)
which_sigs <- which(mc_corrs$p < 0.05)
sig_otus <- colnames(mc_corrs$p)[which_sigs]
mc_corrs_dat <- data.frame(tax_table(nc_prune)) %>% filter(Species %in% sig_otus)
mc_corrs_dat$Rho <- mc_corrs$r[which_sigs]
mc_corrs_dat
## Kingdom Phylum Class Order
## 1 Bacteria Proteobacteria Betaproteobacteria Burkholderiales
## 2 Bacteria Actinobacteria Actinobacteria Actinomycetales
## 3 Bacteria Bacteroidetes Sphingobacteria Sphingobacteriales
## 4 Bacteria Actinobacteria Actinobacteria Actinomycetales
## 5 Bacteria Actinobacteria Actinobacteria Actinomycetales
## 6 Bacteria Proteobacteria Alphaproteobacteria Rhodospirillales
## 7 Bacteria Planctomycetes Planctomycetacia Planctomycetales
## 8 Bacteria Actinobacteria Actinobacteria Actinomycetales
## 9 Bacteria Bacteroidetes Flavobacteria Flavobacteriales
## 10 Bacteria Bacteroidetes Flavobacteria Flavobacteriales
## 11 Bacteria Bacteroidetes Cytophagia Cytophagales
## 12 Bacteria Verrucomicrobia OPB35_soil_group unclassified
## 13 Bacteria Bacteroidetes Flavobacteria Flavobacteriales
## 14 Bacteria Planctomycetes Planctomycetacia Planctomycetales
## 15 Bacteria Proteobacteria Alphaproteobacteria Rhodobacterales
## 16 Bacteria Actinobacteria Actinobacteria Actinomycetales
## 17 Bacteria Actinobacteria Actinobacteria Actinomycetales
## 18 Bacteria Proteobacteria Betaproteobacteria Burkholderiales
## 19 Bacteria Bacteroidetes Sphingobacteriia Sphingobacteriales
## 20 Bacteria Bacteroidetes Sphingobacteria Sphingobacteriales
## 21 Bacteria Bacteroidetes Sphingobacteria Sphingobacteriales
## 22 Bacteria Bacteroidetes Sphingobacteria Sphingobacteriales
## 23 Bacteria Planctomycetes Planctomycetacia Planctomycetales
## 24 Bacteria Verrucomicrobia Spartobacteria Spartobacteriales
## 25 Bacteria Bacteroidetes Flavobacteria Flavobacteriales
## 26 Bacteria Bacteroidetes Flavobacteria Flavobacteriales
## 27 Bacteria Planctomycetes Planctomycetacia Planctomycetales
## 28 Bacteria Armatimonadetes Armatimonadia Armatimonadales
## 29 Bacteria Proteobacteria Alphaproteobacteria Caulobacterales
## 30 Bacteria Chloroflexi Chloroflexia Chloroflexales
## 31 Bacteria Verrucomicrobia Opitutae Opitutales
## 32 Bacteria Verrucomicrobia OPB35_soil_group unclassified
## 33 Bacteria Bacteroidetes Flavobacteria Flavobacteriales
## 34 Bacteria Proteobacteria Gammaproteobacteria Alteromonadales
## 35 Bacteria Verrucomicrobia Opitutae Opitutales
## 36 Bacteria Bacteroidetes Sphingobacteria Sphingobacteriales
## 37 Bacteria Planctomycetes Planctomycetacia Planctomycetales
## 38 Bacteria Actinobacteria Actinobacteria Acidimicrobiales
## 39 Bacteria Proteobacteria Alphaproteobacteria Rhodobacterales
## 40 Bacteria Bacteroidetes Sphingobacteriia Sphingobacteriales
## 41 Bacteria Verrucomicrobia Verrucomicrobiae Verrucomicrobiales
## 42 Bacteria Chlorobi Chlorobia Chlorobiales
## 43 Bacteria Actinobacteria Actinobacteria Acidimicrobiales
## 44 Bacteria Bacteroidetes Sphingobacteria Sphingobacteriales
## 45 Bacteria Proteobacteria Deltaproteobacteria Myxococcales
## 46 Bacteria Bacteroidetes Sphingobacteria Sphingobacteriales
## 47 Bacteria Planctomycetes Planctomycetacia Planctomycetales
## 48 Bacteria Bacteroidetes Sphingobacteria Sphingobacteriales
## 49 Bacteria Actinobacteria Actinobacteria Acidimicrobiales
## 50 Bacteria Bacteroidetes Sphingobacteriia Sphingobacteriales
## 51 Bacteria Verrucomicrobia Spartobacteria Spartobacteriales
## 52 Bacteria Proteobacteria Betaproteobacteria betV
## Family Genus Rank7
## 1 betI betI-A Lhab-A1
## 2 acI acI-C acI-C2
## 3 bacI bacI-A bacI-A1
## 4 acI acI-A unclassified
## 5 acI acI-A acI-A6
## 6 Acetobacteraceae Roseomonas <NA>
## 7 Planctomycetaceae unclassified <NA>
## 8 acI acI-A acI-A3
## 9 bacV unclassified unclassified
## 10 bacV unclassified unclassified
## 11 Cytophagaceae unclassified unclassified
## 12 unclassified unclassified <NA>
## 13 bacV unclassified unclassified
## 14 Planctomycetaceae Pirellula <NA>
## 15 Rhodobacteraceae Rhodobacter <NA>
## 16 acI acI-C acI-C2
## 17 Luna1 Luna1-A Luna1-A4
## 18 Comamonadaceae unclassified unclassified
## 19 env.OPS_17 unclassified unclassified
## 20 bacI unclassified unclassified
## 21 bacIII bacIII-A unclassified
## 22 bacI unclassified unclassified
## 23 Planctomycetaceae Planctomyces <NA>
## 24 Spartobacteriaceae CandidatusXiphinematobacter verI-A
## 25 bacII bacII-A Flavo-A1
## 26 bacII bacII-A unclassified
## 27 Planctomycetaceae Planctomyces <NA>
## 28 Armatimonadaceae Armatimonas unclassified
## 29 Caulobacteraceae Phenylobacterium <NA>
## 30 Roseiflexaceae Roseiflexus <NA>
## 31 Opitutaceae unclassified unclassified
## 32 unclassified unclassified <NA>
## 33 bacV unclassified unclassified
## 34 Alteromonadaceae BD1-7_clade <NA>
## 35 Opitutaceae unclassified unclassified
## 36 bacVI unclassified unclassified
## 37 Planctomycetaceae Pirellula <NA>
## 38 acIV acIV-D Iamia
## 39 Rhodobacteraceae unclassified <NA>
## 40 NS11-12_marine_group unclassified unclassified
## 41 Verrucomicrobiaceae unclassified <NA>
## 42 OPB56 unclassified <NA>
## 43 acIV unclassified unclassified
## 44 bacIV bacIV-B Aquir
## 45 0319-6G20 unclassified <NA>
## 46 bacVI unclassified unclassified
## 47 Planctomycetaceae Blastopirellula <NA>
## 48 bacI unclassified unclassified
## 49 acIV acIV-C unclassified
## 50 NS11-12_marine_group unclassified <NA>
## 51 Spartobacteriaceae CandidatusXiphinematobacter unclassified
## 52 betV-A betV-A1 unclassified
## Rank8 Species Rho
## 1 unclassified Otu00002 -0.3972260
## 2 unclassified Otu00006 0.6541005
## 3 unclassified Otu00008 -0.4389629
## 4 unclassified Otu00010 -0.4214664
## 5 unclassified Otu00011 -0.4918456
## 6 <NA> Otu00012 0.6951876
## 7 <NA> Otu00014 0.6052109
## 8 unclassified Otu00016 -0.4071931
## 9 unclassified Otu00017 -0.4733574
## 10 unclassified Otu00023 -0.3285210
## 11 unclassified Otu00024 0.7050750
## 12 <NA> Otu00025 -0.3257464
## 13 unclassified Otu00028 -0.3204986
## 14 <NA> Otu00029 0.5718519
## 15 <NA> Otu00030 0.8631638
## 16 unclassified Otu00032 0.6418170
## 17 unclassified Otu00036 -0.5039319
## 18 unclassified Otu00041 0.6678786
## 19 unclassified Otu00043 0.3989192
## 20 unclassified Otu00047 -0.6929731
## 21 unclassified Otu00048 -0.5743878
## 22 unclassified Otu00052 0.6502580
## 23 <NA> Otu00054 0.5176195
## 24 unclassified Otu00056 0.4261430
## 25 unclassified Otu00057 0.4464391
## 26 unclassified Otu00070 -0.6599055
## 27 <NA> Otu00073 0.6197799
## 28 unclassified Otu00077 -0.6003231
## 29 <NA> Otu00078 0.8491539
## 30 <NA> Otu00081 0.5789068
## 31 unclassified Otu00084 -0.4491541
## 32 <NA> Otu00092 0.4823769
## 33 unclassified Otu00094 -0.3946931
## 34 <NA> Otu00095 0.3467692
## 35 unclassified Otu00099 -0.3435773
## 36 unclassified Otu00113 -0.4906489
## 37 <NA> Otu00115 0.5510557
## 38 unclassified Otu00123 0.3381007
## 39 <NA> Otu00142 -0.4395983
## 40 unclassified Otu00144 0.3723329
## 41 <NA> Otu00153 0.5679166
## 42 <NA> Otu00159 0.7065858
## 43 unclassified Otu00168 0.5823920
## 44 unclassified Otu00175 0.3689566
## 45 <NA> Otu00189 0.3992416
## 46 unclassified Otu00208 0.3326477
## 47 <NA> Otu00213 0.6243399
## 48 unclassified Otu00225 0.5072148
## 49 unclassified Otu00232 -0.5309842
## 50 <NA> Otu00245 0.6375114
## 51 unclassified Otu00290 0.5884746
## 52 unclassified Otu01489 0.8583584
# Calculate correlation between phycocyanin and nc-bacteria
cyano_corrs <- corr.test(
x = data.frame(pcoa_df$Phycocyanin),
y = t(otu_table(nc_prune)),
method = "spearman",
use = "complete",
adjust = "fdr"
)
which_sigs <- which(cyano_corrs$p < 0.05)
sig_otus <- colnames(cyano_corrs$p)[which_sigs]
cyano_corrs_dat <- data.frame(tax_table(nc_prune)) %>% filter(Species %in% sig_otus)
cyano_corrs_dat$Rho <- cyano_corrs$r[which_sigs]
nc_prune %>%
tax_glom(taxrank = "Family") %>%
psmelt() %>%
group_by(Family) %>%
summarise(mean = mean(Abundance)) %>%
arrange(desc(mean))
## # A tibble: 44 x 2
## Family mean
## <fctr> <dbl>
## 1 acI 3390.6604
## 2 bacI 1063.3019
## 3 betI 686.2264
## 4 Planctomycetaceae 647.2075
## 5 bacV 467.3962
## 6 betIV 447.0189
## 7 betII 271.0377
## 8 acTH1 241.8302
## 9 acIV 235.3396
## 10 bacIII 229.1321
## # ... with 34 more rows
AcI is the most abundant clade
groups <- c("acI", "bacI", "betI", "Planctomycetaceae", "bacV")
nc_melt <- nc_prune %>%
transform_sample_counts(function(x) {x/sum(x)} ) %>%
psmelt() %>%
order_dates
group_otu_plots <- lapply(groups, function(x) {
group_df <-
nc_melt %>%
filter(Family == x)
group_otus <- unique(group_df$Species)
group_plots <- lapply(group_otus, function(y) {
df_otu <- filter(group_df, OTU == y)
plot <- plot_otus(df = df_otu, otu = y, taxrank = "Genus")
})
return(group_plots)
})
do.call(grid.arrange, group_otu_plots[[1]])
do.call(grid.arrange, group_otu_plots[[2]])
do.call(grid.arrange, group_otu_plots[[3]])
do.call(grid.arrange, group_otu_plots[[4]])
do.call(grid.arrange, group_otu_plots[[5]])
# AcI plot
aci_plots <- group_otu_plots[[1]]
grid.arrange(
aci_plots[[3]], aci_plots[[4]], aci_plots[[5]], aci_plots[[8]],
aci_plots[[1]], aci_plots[[2]], aci_plots[[6]], aci_plots[[7]],
station_legend,
ncol = 4, nrow = 3
)